Chapter 4 MAG catalogue

4.1 Gut microbiota

load("data/gut/data.Rdata")

4.1.1 Genome phylogeny

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate new species table
newspecies_table <- genome_metadata %>%
  mutate(newspecies=ifelse(species=="s__","Y","N")) %>%
  select(genome,newspecies) %>%
  column_to_rownames(var = "genome")

# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.5)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
# circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.55, width=0.1, colnames=FALSE) +
#         scale_fill_manual(values=phylum_colors) +
#         geom_tiplab2(size=1, hjust=-0.1) +
#         theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.05, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
  labs(fill="Phylum")
        #theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))

# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=genome, fill=contamination),
                offset = 0.24,
                pwidth = 0.1,
                orientation="y",
              stat="identity")+
  labs(fill="Contamination")

# Add genome-size ring
circular_tree <-  circular_tree + new_scale_fill()

circular_tree <- gheatmap(circular_tree, newspecies_table, offset=0.3, width=0.05, colnames=FALSE) +
  scale_fill_manual(values=c("#f4f4f4","#74C8AE"))+
  labs(fill="New species")
        #theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))


circular_tree <-  circular_tree +
  geom_fruit(
    data=genome_metadata,
    geom=geom_bar,
    mapping = aes(x=length, y=genome),
    fill = "#1e6e55",
    offset = 0.05,
    orientation="y",
    stat="identity")


# Add text
circular_tree <-  circular_tree +
        annotate('text', x=3.1, y=0, label='            Phylum', family='arial', size=3.5) +
        annotate('text', x=3.7, y=0, label='                         Genome quality', family='arial', size=3.5) +
        annotate('text', x=4.1, y=0, label='                     Genome size', family='arial', size=3.5) +
        annotate('text', x=3.4, y=0, label='                     New species', family='arial', size=3.5)

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

4.1.2 Taxonomy overview

tax_mag <-genome_metadata %>%
  group_by(phylum) %>%
  summarise(mag_n=n())

tax_mag %>%
  mutate(percetage_mag=round(mag_n*100/sum(mag_n), 2)) %>%
  arrange(-percetage_mag) %>%
  tt()
phylum mag_n percetage_mag
p__Bacteroidota 191 35.44
p__Bacillota_A 175 32.47
p__Pseudomonadota 87 16.14
p__Bacillota 23 4.27
p__Verrucomicrobiota 18 3.34
p__Cyanobacteriota 12 2.23
p__Bacillota_C 11 2.04
p__Desulfobacterota 7 1.30
p__Fusobacteriota 6 1.11
p__Bacillota_B 4 0.74
p__Deferribacterota 2 0.37
p__Elusimicrobiota 2 0.37
p__Chlamydiota 1 0.19

4.1.3 Mag size (MB)

genome_metadata <- genome_metadata %>%
  mutate(corrected_size=100*length/completeness) %>%
  arrange(completeness)
genome_metadata %>%
  summarise(Average_corrected_size=mean(corrected_size))
# A tibble: 1 × 1
  Average_corrected_size
                   <dbl>
1               3245297.

4.1.4 Genome quality

genome_metadata %>%
    summarise(completeness_mean=mean(completeness) %>% round(2) %>% as.character(),
              completeness_sd=sd(completeness) %>% round(2) %>% as.character(),
              contamination_mean=mean(contamination) %>% round(2),
              contamination_sd=sd(contamination) %>% round(2)) %>%
    unite("Completeness",completeness_mean, completeness_sd, sep = " ± ", remove = TRUE) %>%
    unite("Contamination",contamination_mean, contamination_sd, sep = " ± ", remove = TRUE) %>%
    tt()
Completeness Contamination
85.01 ± 16.2 1.68 ± 2.25
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  select(c(genome,domain,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_void() +
                    theme(legend.position = "none",
                        axis.title.x = element_blank(),
                        axis.title.y = element_blank(),
                        axis.text.y=element_blank(),
                        axis.ticks.y=element_blank(),
                        axis.text.x=element_blank(),
                        axis.ticks.x=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(50,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_void() +
                theme(legend.position = "none",
                    axis.title.x = element_blank(),
                    axis.title.y = element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),
                    axis.text.x=element_blank(),
                    axis.ticks.x=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

4.1.5 Functional overview

4.1.5.1 Predicted genes

genome_annotations <- read_tsv("data/gut/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

# Predicted genes
pred_genes <- genome_annotations %>%
  nrow()

cat(pred_genes)
1219667

4.1.5.2 Number of annotated genes and percentages

# Some annotation
genome_annota <- genome_annotations %>%
  filter(if_any(c(kegg_id, pfam_hits, cazy_hits), ~ !is.na(.))) %>%
  nrow()

cat(genome_annota)
942120
# Some annotation percentage
genome_annota*100/pred_genes
[1] 77.24403

4.1.5.3 Number of KEGG annotatated genes and percentages

# KEGG annotation
kegg_annota <- genome_annotations %>%
  filter(!is.na(kegg_id)) %>%
  nrow()
cat(kegg_annota)
557611
# KEGG annotation percentage
kegg_annota*100/genome_annota
[1] 59.18683
# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate  basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
                ggtree(., size = 0.3)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

# Add completeness barplots
function_tree <- function_tree +
            geom_fruit(data=genome_metadata,
            geom=geom_bar,
            grid.params=list(axis="x", text.size=2, nbreak = 1),
            axis.params=list(vline=TRUE),
            mapping = aes(x=length, y=genome, fill=completeness),
                 offset = 3.8,
                 orientation="y",
                 stat="identity") +
            scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
            labs(fill="Genome\ncompleteness")

function_tree

4.1.6 Functional ordination

gift_pcoa <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    vegdist(method="euclidean") %>%
    pcoa()

gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]


# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
  as.data.frame() %>% 
  select(Axis.1,Axis.2) # keep the first 2 axes

gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]

gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*% diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
  as.data.frame() %>% 
  rename(Axis.1=1,Axis.2=2) %>% 
  rownames_to_column(var="label") %>% 
  #get function summary vectors
  mutate(func=substr(label,1,3)) %>% 
  group_by(func) %>% 
  summarise(Axis.1=mean(Axis.1),
            Axis.2=mean(Axis.2)) %>% 
  rename(label=func) %>% 
  filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings
gift_pcoa_vectors %>% 
  rownames_to_column(var="genome") %>% 
  left_join(genome_metadata, by="genome") %>%
  ggplot() +
      #genome positions
      scale_color_manual(values=phylum_colors)+
      geom_point(aes(x=Axis.1,y=Axis.2, color=phylum, size=length), 
                 alpha=0.9, shape=16) +
      #scale_color_manual(values=phylum_colors) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-4.5,4.5) + 
    ylim(-3,2.5) +
    theme_minimal() + 
    theme(legend.position = "none")

4.2 Skin microbiota

load("data/skin/data.Rdata")
genome_gifts <- genome_gifts[, colSums(genome_gifts != 0) > 0]

4.2.1 Genome phylogeny

# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
    right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
    arrange(match(genome, genome_tree$tip.label)) %>%
    select(genome,phylum) %>%
    mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
    column_to_rownames(var = "genome")

# Generate new species table
newspecies_table <- genome_metadata %>%
  mutate(newspecies=ifelse(species=="s__","Y","N")) %>%
  select(genome,newspecies) %>%
  column_to_rownames(var = "genome")

# Generate  basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
    ggtree(., layout="fan", open.angle=10, size=0.5)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum ring
# circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.55, width=0.1, colnames=FALSE) +
#         scale_fill_manual(values=phylum_colors) +
#         geom_tiplab2(size=1, hjust=-0.1) +
#         theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.05, width=0.05, colnames=FALSE) +
        scale_fill_manual(values=phylum_colors) +
  labs(fill="Phylum")
        #theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))

# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add completeness ring
circular_tree <- circular_tree +
        new_scale_fill() +
        scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
        geom_fruit(
                data=genome_metadata,
                geom=geom_bar,
                mapping = aes(x=completeness, y=genome, fill=contamination),
                offset = 0.24,
                pwidth = 0.1,
                orientation="y",
              stat="identity")+
  labs(fill="Contamination")

# Add genome-size ring
circular_tree <-  circular_tree + new_scale_fill()

circular_tree <- gheatmap(circular_tree, newspecies_table, offset=0.3, width=0.05, colnames=FALSE) +
  scale_fill_manual(values=c("#f4f4f4","#74C8AE"))+
  labs(fill="New species")
        #theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))


circular_tree <-  circular_tree +
  geom_fruit(
    data=genome_metadata,
    geom=geom_bar,
    mapping = aes(x=length, y=genome),
    fill = "#1e6e55",
    offset = 0.05,
    orientation="y",
    stat="identity")


# Add text
circular_tree <-  circular_tree +
        annotate('text', x=2.2, y=0, label='          Phylum', family='arial', size=3.5) +
        annotate('text', x=2.7, y=0, label='                        Genome quality', family='arial', size=3.5) +
        annotate('text', x=3.0, y=0, label='                    Genome size', family='arial', size=3.5) +
        annotate('text', x=2.5, y=0, label='                    New species', family='arial', size=3.5)

#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)

4.2.2 Taxonomy overview

genome_metadata %>%
  group_by(phylum) %>%
  summarise(mag_n=n()) %>%
  arrange(-mag_n) %>%
  tt()
phylum mag_n
p__Pseudomonadota 35
p__Bacteroidota 6
p__ 2

4.2.3 Genome quality

genome_metadata %>%
    summarise(completeness_mean=mean(completeness) %>% round(2) %>% as.character(),
              completeness_sd=sd(completeness) %>% round(2) %>% as.character(),
              contamination_mean=mean(contamination) %>% round(2),
              contamination_sd=sd(contamination) %>% round(2)) %>%
    unite("Completeness",completeness_mean, completeness_sd, sep = " ± ", remove = TRUE) %>%
    unite("Contamination",contamination_mean, contamination_sd, sep = " ± ", remove = TRUE) %>%
    tt()
Completeness Contamination
77.24 ± 17.04 4.48 ± 2.42
#Generate quality biplot
genome_biplot <- genome_metadata %>%
  select(c(genome,domain,phylum,completeness,contamination,length)) %>%
  arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
  ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
              geom_point(alpha=0.7) +
                    ylim(c(10,0)) +
                    scale_color_manual(values=phylum_colors) +
                    labs(y= "Contamination", x = "Completeness") +
                    theme_classic() +
                    theme(legend.position = "none")

#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
            ggplot(aes(y=contamination)) +
                    ylim(c(10,0)) +
                    geom_boxplot(colour = "#999999", fill="#cccccc") +
                    theme_void() +
                    theme(legend.position = "none",
                        axis.title.x = element_blank(),
                        axis.title.y = element_blank(),
                        axis.text.y=element_blank(),
                        axis.ticks.y=element_blank(),
                        axis.text.x=element_blank(),
                        axis.ticks.x=element_blank(),
                        plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)

#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
        ggplot(aes(x=completeness)) +
                xlim(c(50,100)) +
                geom_boxplot(colour = "#999999", fill="#cccccc") +
                theme_void() +
                theme(legend.position = "none",
                    axis.title.x = element_blank(),
                    axis.title.y = element_blank(),
                    axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),
                    axis.text.x=element_blank(),
                    axis.ticks.x=element_blank(),
                    plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)

#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
        layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3),
                              c(2,2,2,2,2,2,2,2,2,2,2,3)))

4.2.4 Functional overview

4.2.4.1 Predicted genes

genome_annotations <- read_tsv("data/skin/genome_annotations.tsv.xz") %>%
    rename(gene=1, genome=2, contig=3)

# Predicted genes
pred_genes <- genome_annotations %>%
  nrow()

cat(pred_genes)
114303

4.2.4.2 Number of annotated genes and percentages

# Some annotation
genome_annota <- genome_annotations %>%
  filter(if_any(c(kegg_id, pfam_hits, cazy_hits), ~ !is.na(.))) %>%
  nrow()

cat(genome_annota)
88006
# Some annotation percentage
genome_annota*100/pred_genes
[1] 76.9936

4.2.4.3 Number of KEGG annotatated genes and percentages

# KEGG annotation
kegg_annota <- genome_annotations %>%
  filter(!is.na(kegg_id)) %>%
  nrow()
cat(kegg_annota)
52991
# KEGG annotation percentage
kegg_annota*100/genome_annota
[1] 60.21294
# Aggregate basal GIFT into elements
function_table <- genome_gifts %>%
    to.elements(., GIFT_db)

# Generate  basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
                ggtree(., size = 0.3)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
            scale_fill_manual(values=phylum_colors) +
            labs(fill="Phylum")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
            vexpand(.08) +
            coord_cartesian(clip = "off") +
            scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
            labs(fill="GIFT")

#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()

# Add completeness barplots
function_tree <- function_tree +
            geom_fruit(data=genome_metadata,
            geom=geom_bar,
            grid.params=list(axis="x", text.size=2, nbreak = 1),
            axis.params=list(vline=TRUE),
            mapping = aes(x=length, y=genome, fill=completeness),
                 offset = 3.8,
                 orientation="y",
                 stat="identity") +
            scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
            labs(fill="Genome\ncompleteness")

function_tree

4.2.5 Functional ordination

gift_pcoa <- genome_gifts %>%
    to.elements(., GIFT_db) %>%
    as.data.frame() %>%
    vegdist(method="euclidean") %>%
    pcoa()

gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]


# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
  as.data.frame() %>% 
  select(Axis.1,Axis.2) # keep the first 2 axes

gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]

gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*% diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
  as.data.frame() %>% 
  rename(Axis.1=1,Axis.2=2) %>% 
  rownames_to_column(var="label") %>% 
  #get function summary vectors
  mutate(func=substr(label,1,3)) %>% 
  group_by(func) %>% 
  summarise(Axis.1=mean(Axis.1),
            Axis.2=mean(Axis.2)) %>% 
  rename(label=func) %>% 
  filter(!label %in% c("S01","S02","S03"))
set.seed(101)
scale <- 20 # scale for vector loadings
gift_pcoa_vectors %>% 
  rownames_to_column(var="genome") %>% 
  left_join(genome_metadata, by="genome") %>%
  ggplot() +
      #genome positions
      scale_color_manual(values=phylum_colors)+
      geom_point(aes(x=Axis.1,y=Axis.2, color=phylum, size=length), 
                 alpha=0.9, shape=16) +
      #scale_color_manual(values=phylum_colors) +
      scale_size_continuous(range = c(0.1,5)) +
      #loading positions
      geom_segment(data=gift_pcoa_gifts, 
                   aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
                    arrow = arrow(length = unit(0.3, "cm"), 
                    type = "open", 
                    angle = 25),
                    linewidth = 0.5, 
                    color = "black") +
     #Primary and secondary scale adjustments
     scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
            ) +
     scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
                      sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
            ) +
    geom_label_repel(data = gift_pcoa_gifts,
                     aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
                     segment.color = 'transparent') +
    xlim(-4.5,4.5) + 
    ylim(-3,2.5) +
    theme_minimal() + 
    theme(legend.position = "none")

# Generate the tSNE ordination
set.seed(1001)
tSNE_function <- Rtsne(X=function_table, dims = 2, perplexity = 13, check_duplicates = FALSE)

# Plot the ordination
function_ordination <- tSNE_function$Y %>%
                as.data.frame() %>%
                mutate(genome=rownames(function_table)) %>%
                inner_join(genome_metadata, by="genome") %>%
                rename(tSNE1="V1", tSNE2="V2") %>%
                select(genome,phylum,tSNE1,tSNE2, length) %>%
                ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
                            geom_point(shape=16, alpha=0.7) +
                            scale_color_manual(values=phylum_colors) +
                            ylim(-30,40)+
                            xlim(-40,40)+
                            theme_minimal() +
                labs(color="Phylum", size="Genome size") +
                guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend

function_ordination
Warning: Removed 34 rows containing missing values or values outside the scale range (`geom_point()`).